import pandas as pd
import plotly.express as px
import numpy as np
from plotly.subplots import make_subplots
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from PyNomaly import loop
from statsmodels.robust import scale
# Base location of the course data sets; both frames come from the same folder.
_DATA_BASE = ("https://raw.githubusercontent.com/lihkir/Uninorte/main/"
              "AppliedStatisticMS/DataVisualizationRPython/Lectures/Python/"
              "PythonDataSets/")
DatosGm = pd.read_csv(_DATA_BASE + "gapminder.csv")   # Gapminder indicators
DatosCo2 = pd.read_csv(_DATA_BASE + "co2.csv")        # CO2, one column per year
DatosGm.columns
Index(['Country', 'Year', 'fertility', 'life', 'population', 'child_mortality',
'gdp', 'region'],
dtype='object')
DatosCo2.columns
Index(['country', '1800', '1801', '1802', '1803', '1804', '1805', '1806',
'1807', '1808',
...
'2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
'2014'],
dtype='object', length=216)
# Keep only the features used in the CO2 comparison; the rest are discarded.
DatosGm.drop(columns=['child_mortality', 'gdp', 'region'], inplace=True)
# Earliest year with records in the Gapminder frame.
DatosGm["Year"].min()
1964
Teniendo en cuenta que en el dataframe de Gapminder los datos solo se registran a partir de 1964, se procede a descartar los años anteriores, los cuales presentan bastantes NaN y pueden no ser útiles debido a lo distantes que son.
# Reshape CO2 from wide (one column per year) to long (country / Year / Co2).
melted_df = DatosCo2.melt(id_vars=['country'], var_name='Year', value_name='Co2')
melted_df["Year"] = pd.to_numeric(melted_df["Year"])
# Keep only years that overlap with the Gapminder records (1964 onward).
# NOTE(review): attribute-style assignment (DatosCo2.new) triggers a pandas
# UserWarning and does not create a real column; kept because later cells
# read DatosCo2.new.
DatosCo2.new = melted_df.loc[melted_df["Year"] >= 1964]
C:\Users\g_a09\AppData\Local\Temp\ipykernel_15220\2637862456.py:1: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access DatosCo2.new=melted_df[melted_df["Year"]>=1964]
DatosCo2.new.head()
| country | Year | Co2 | |
|---|---|---|---|
| 31488 | Afghanistan | 1964 | 0.0863 |
| 31489 | Albania | 1964 | 1.0900 |
| 31490 | Algeria | 1964 | 0.4600 |
| 31491 | Andorra | 1964 | NaN |
| 31492 | Angola | 1964 | 0.2010 |
# Align the key column name with DatosGm ('Country') so the merge keys match.
DatosCo2.new=DatosCo2.new.rename(columns={"country":"Country"})
DatosCo2.new.columns
Index(['Country', 'Year', 'Co2'], dtype='object')
Se procede a realizar el merge de ambos df para poder analizar los datos y encontrar correlaciones entre las caracteristicas.
# Inner join on Country and Year: keeps only country/year pairs present in both.
merged_df = DatosCo2.new.merge(DatosGm, on=['Country', 'Year'])
merged_df.head()
| Country | Year | Co2 | fertility | life | population | |
|---|---|---|---|---|---|---|
| 0 | Afghanistan | 1964 | 0.0863 | 7.671 | 33.639 | 10474903.0 |
| 1 | Albania | 1964 | 1.0900 | 5.711 | 65.475 | 1817098.0 |
| 2 | Algeria | 1964 | 0.4600 | 7.653 | 47.953 | 11654905.0 |
| 3 | Angola | 1964 | 0.2010 | 7.425 | 34.604 | 5337063.0 |
| 4 | Antigua and Barbuda | 1964 | 1.5400 | 4.250 | 63.775 | 58653.0 |
Luego de realizar el merge y tener el df de una manera practica a analizar, se procede a realizar las comparaciones de las caracteristicas.
# Animated scatter of CO2 vs life expectancy, one frame per year.
x_limits = [merged_df["life"].min(), merged_df["life"].max()]
y_limits = [merged_df["Co2"].min(), merged_df["Co2"].max()]
fig = px.scatter(
    merged_df,
    x="life",
    y="Co2",
    hover_name="Country",
    animation_frame="Year",
    range_x=x_limits,
    range_y=y_limits,
)
# Muestra el gráfico
fig.show()
Se puede apreciar que no existe una correlación aparente fuerte entre las características Co2 y life; esto se comprueba mediante el método .corr()
correlacion=merged_df["Co2"].corr(merged_df["life"])
print(correlacion)
0.40288934295677575
El método corr arroja aproximadamente 0.40: una correlación positiva moderada (no fuerte) entre estas 2 características.
# Animated scatter of CO2 vs fertility, one frame per year.
x_limits = [merged_df["fertility"].min(), merged_df["fertility"].max()]
y_limits = [merged_df["Co2"].min(), merged_df["Co2"].max()]
fig = px.scatter(
    merged_df,
    x="fertility",
    y="Co2",
    hover_name="Country",
    animation_frame="Year",
    range_x=x_limits,
    range_y=y_limits,
)
# Muestra el gráfico
fig.show()
Se realiza lo mismo previamente hecho con la anterior caracteristica para fertility
correlacion=merged_df["Co2"].corr(merged_df["fertility"])
print(correlacion)
-0.31439742304488627
El método corr arroja aproximadamente −0.31: una correlación negativa débil a moderada entre estas 2 características.
# Pima Indians Diabetes data set: 8 clinical features plus a binary Outcome.
Diabetes=pd.read_csv("https://raw.githubusercontent.com/lihkir/Data/main/diabetes.csv")
Diabetes.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
En primera instancia se aprecia que hay algunos valores en 0 para variables médicas que es imposible que sean 0; por ende son datos faltantes y se procede a reemplazarlos por NaN.
Diabetes.loc[Diabetes["Glucose"] == 0.0, "Glucose"] = np.NAN
Diabetes.loc[Diabetes["BloodPressure"] == 0.0, "BloodPressure"] = np.NAN
Diabetes.loc[Diabetes["SkinThickness"] == 0.0, "SkinThickness"] = np.NAN
Diabetes.loc[Diabetes["Insulin"] == 0.0, "Insulin"] = np.NAN
Diabetes.loc[Diabetes["BMI"] == 0.0, "BMI"] = np.NAN
Nos aseguramos que todos los datos esten en formato numerico
# Coerce every column to a numeric dtype; unparseable entries become NaN.
df = Diabetes.apply(pd.to_numeric, errors='coerce')
df.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148.0 | 72.0 | 35.0 | NaN | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85.0 | 66.0 | 29.0 | NaN | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183.0 | 64.0 | NaN | NaN | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89.0 | 66.0 | 23.0 | 94.0 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137.0 | 40.0 | 35.0 | 168.0 | 43.1 | 2.288 | 33 | 1 |
# Per-column missing-value report: absolute count and percentage of rows.
n_rows = df.shape[0]
for nombre, n_miss in df.isnull().sum().items():
    print(f"> Columna '{nombre}', Faltantes: {n_miss} ({n_miss / n_rows * 100:.1f}%)")
> Columna 'Pregnancies', Faltantes: 0 (0.0%) > Columna 'Glucose', Faltantes: 5 (0.7%) > Columna 'BloodPressure', Faltantes: 35 (4.6%) > Columna 'SkinThickness', Faltantes: 227 (29.6%) > Columna 'Insulin', Faltantes: 374 (48.7%) > Columna 'BMI', Faltantes: 11 (1.4%) > Columna 'DiabetesPedigreeFunction', Faltantes: 0 (0.0%) > Columna 'Age', Faltantes: 0 (0.0%) > Columna 'Outcome', Faltantes: 0 (0.0%)
La caracteristica Glucose no presenta casi faltantes, en cambio bloodpressure, skinthickness e insulin, presentan una cantidad de datos faltantes considerables
# One box plot per feature, split by Outcome, laid out on a 2x4 subplot grid.
# The per-figure titles are kept for fidelity but are discarded when only the
# data trace is copied into the composite figure.
# NOTE(review): the third title looks like a typo for "BloodPressure".
plots = [
    ("Pregnancies", "Pregnancies"),
    ("Glucose", "Glucose"),
    ("BloodPressure", "PregnancBloodPressureies"),
    ("SkinThickness", "SkinThickness"),
    ("Insulin", "Insulin"),
    ("BMI", "BMI"),
    ("DiabetesPedigreeFunction", "DiabetesPedigreeFunction"),
    ("Age", "Age"),
]
fig = make_subplots(rows=2, cols=4)
for idx, (feature, titulo) in enumerate(plots):
    box = px.box(df, x='Outcome', y=feature, notched=False, title=titulo)
    fig.add_trace(box['data'][0], row=idx // 4 + 1, col=idx % 4 + 1)
fig.update_layout(title='Boxplots de diferentes características por Outcome')
fig.show()
Se puede ver preliminarmente que la mayoria de caracteristicas presentan datos "atipicos" al realizar un boxplot con respecto el outcome, teniendo en cuenta que hay 3 en especifico que tiene muchos Nan lo cual puede afectar dichas graficas y datos "atipicos"
Usando Transformación de Datos con IterativeImputer para los datos faltantes
# NOTE(review): this configured imputer is never fitted — it is re-created
# with defaults two cells below (the arguments given here happen to equal the
# defaults anyway, so no behavior is lost).
imputer = IterativeImputer(n_nearest_features=None, imputation_order='ascending')
data = df.values
# ix spans ALL columns, so X still contains the Outcome column (index 8)
# alongside the features; y duplicates that last column.
ix = [i for i in range(data.shape[1])]
X, y = data[:, ix], data[:, 8]
# Total NaN cells across the matrix before imputation.
print('Missing: %d' % np.sum(np.isnan(X).flatten()))
Missing: 652
# Default IterativeImputer: round-robin regression over the features.
imputer = IterativeImputer()
imputer.fit(X)                 # learn the per-feature imputation models
Xtrans = imputer.transform(X)  # fill every NaN in X
# After imputation no missing entries should remain.
print('Missing: %d' % int(np.isnan(Xtrans).sum()))
Missing: 0
Creando Pipeline
Dado que los valores que queremos imputar son todos del tipo decimal, necesitamos una regresion
# Impute-then-predict pipeline skeleton.
# NOTE(review): these three objects are recreated a few cells below before
# ever being used, so this cell is effectively redundant.
model = RandomForestRegressor()
imputer = IterativeImputer()
pipeline = Pipeline(steps=[('i', imputer), ('m', model)])
# Reload the raw diabetes data and re-apply the missing-value encoding.
url = 'https://raw.githubusercontent.com/lihkir/Data/main/diabetes.csv'
dataframe = pd.read_csv(url)
# Zero is physiologically impossible for these measurements -> treat as NaN.
# np.nan replaces the np.NAN alias, which was removed in NumPy 2.0.
for col in ("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"):
    dataframe.loc[dataframe[col] == 0.0, col] = np.nan
dataframe.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148.0 | 72.0 | 35.0 | NaN | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85.0 | 66.0 | 29.0 | NaN | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183.0 | 64.0 | NaN | NaN | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89.0 | 66.0 | 23.0 | 94.0 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137.0 | 40.0 | 35.0 | 168.0 | 43.1 | 2.288 | 33 | 1 |
data = dataframe.values
# Bug fix: the original built X = data[:, ix] with ix covering ALL columns, so
# the target (Outcome, column 8) leaked into the features and the reported
# "Mean Accuracy: 1.000" was an artifact of that leak, not of the imputation.
X, y = data[:, :-1], data[:, -1]
# Outcome is binary, so the evaluation model must be a classifier: accuracy is
# undefined for continuous regressor output (this also matches the pipeline
# fitted for prediction later in the notebook).
model = RandomForestClassifier()
imputer = IterativeImputer()
pipeline = Pipeline(steps=[('i', imputer), ('m', model)])
# 10-fold stratified CV repeated 3 times; fixed seed for reproducible folds.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
Mean Accuracy: 1.000 (0.000)
Ojo: la accuracy de 1.000 reportada arriba no valida la imputación; se debe a que X incluía la columna Outcome (fuga del objetivo). Con las características correctas la accuracy sería considerablemente menor. Se procede a predecir los valores faltantes.
# Reload raw data, encode impossible zeros as missing, and fit the final
# impute-then-classify pipeline on the full data set.
url = 'https://raw.githubusercontent.com/lihkir/Data/main/diabetes.csv'
dataframe = pd.read_csv(url)
# np.nan replaces the np.NAN alias, which was removed in NumPy 2.0.
for col in ("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"):
    dataframe.loc[dataframe[col] == 0.0, col] = np.nan
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
pipeline = Pipeline(steps=[('i', IterativeImputer()), ('m', RandomForestClassifier())])
pipeline.fit(X, y)
Pipeline(steps=[('i', IterativeImputer()), ('m', RandomForestClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('i', IterativeImputer()), ('m', RandomForestClassifier())])IterativeImputer()
RandomForestClassifier()
df_imputado=pipeline.named_steps['i'].transform(dataframe.drop("Outcome",axis=1))
C:\Users\g_a09\anaconda3\lib\site-packages\sklearn\base.py:413: UserWarning: X has feature names, but IterativeImputer was fitted without feature names
# Rebuild a labeled DataFrame from the imputed array, then re-attach the
# untouched target column.
feature_names = dataframe.drop("Outcome", axis=1).columns
df_imp = pd.DataFrame(df_imputado, columns=feature_names, index=dataframe.index)
df_imp["Outcome"] = dataframe["Outcome"]
df_imp.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.0 | 148.0 | 72.0 | 35.000000 | 218.922943 | 33.6 | 0.627 | 50.0 | 1 |
| 1 | 1.0 | 85.0 | 66.0 | 29.000000 | 70.302975 | 26.6 | 0.351 | 31.0 | 0 |
| 2 | 8.0 | 183.0 | 64.0 | 21.532622 | 268.502252 | 23.3 | 0.672 | 32.0 | 1 |
| 3 | 1.0 | 89.0 | 66.0 | 23.000000 | 94.000000 | 28.1 | 0.167 | 21.0 | 0 |
| 4 | 0.0 | 137.0 | 40.0 | 35.000000 | 168.000000 | 43.1 | 2.288 | 33.0 | 1 |
# Verify no missing values remain after imputation.
for columna in df_imp.columns:
    n_miss = df_imp[columna].isnull().sum()
    # Bug fix: the percentage denominator referenced df (the pre-imputation
    # frame) instead of df_imp itself; same row count here, but wrong frame.
    perc = n_miss / df_imp.shape[0] * 100
    print(f"> Columna '{columna}', Faltantes: {n_miss} ({perc:.1f}%)")
> Columna 'Pregnancies', Faltantes: 0 (0.0%) > Columna 'Glucose', Faltantes: 0 (0.0%) > Columna 'BloodPressure', Faltantes: 0 (0.0%) > Columna 'SkinThickness', Faltantes: 0 (0.0%) > Columna 'Insulin', Faltantes: 0 (0.0%) > Columna 'BMI', Faltantes: 0 (0.0%) > Columna 'DiabetesPedigreeFunction', Faltantes: 0 (0.0%) > Columna 'Age', Faltantes: 0 (0.0%)
Realizando un pipeline con el IterativeImputer y RandomForestClassifier como predictor se imputan los valores faltantes y nos queda un df sin Nans
Ahora para analizar los datos atipicos hacemos usos de varios metodos, tales como : Percentiles, Boxplots, Histograms, Descriptivos
# (The unused module-level `outliers = []` that preceded this function was
# removed: nothing read it before it is reassigned later in the notebook.)
def detect_outliers_zscore(df, thres=3.0):
    """Return {column: [values]} of entries whose |z-score| exceeds ``thres``.

    The z-score uses the population standard deviation (ddof=0), matching the
    original np.std-based implementation. Values are returned in their
    original row order.

    Parameters
    ----------
    df : pandas.DataFrame of numeric columns.
    thres : float, z-score cutoff (default 3.0, as before).
    """
    outliers = {}
    for column in df.columns:
        values = df[column].to_numpy(dtype=float)
        std = values.std()
        # A constant column has std == 0; every z-score is undefined, so
        # report no outliers instead of dividing by zero.
        if std == 0:
            outliers[column] = []
            continue
        z_scores = (values - values.mean()) / std
        outliers[column] = values[np.abs(z_scores) > thres].tolist()
    return outliers
# Detect z-score outliers on the features only (Outcome is a binary label).
sample_outliers = detect_outliers_zscore(df_imp.drop("Outcome", axis=1))
for nombre, valores in sample_outliers.items():
    print(f'Outliers para la columna {nombre}: {valores}')
Outliers para la columna Pregnancies: [15.0, 17.0, 14.0, 14.0] Outliers para la columna Glucose: [] Outliers para la columna BloodPressure: [30.0, 110.0, 122.0, 30.0, 110.0, 110.0, 24.0, 114.0] Outliers para la columna SkinThickness: [60.0, 63.0, 99.0] Outliers para la columna Insulin: [543.0, 846.0, 495.0, 485.0, 495.0, 478.0, 744.0, 680.0, 545.0, 465.0, 579.0, 474.0, 480.0, 600.0, 540.0, 480.0, 510.0] Outliers para la columna BMI: [53.2, 55.0, 67.1, 59.4, 57.3] Outliers para la columna DiabetesPedigreeFunction: [2.288, 1.893, 1.781, 2.329, 1.476, 2.137, 1.731, 1.6, 2.42, 1.699, 1.698] Outliers para la columna Age: [69.0, 72.0, 81.0, 70.0, 69.0]
Estos serian los outliers que se detectaron, los cuales se procederan a cambiar por las medianas de las respectivas columnas
def replace_outliers_with_median(df, outliers):
    """Overwrite every listed outlier value with its column's median.

    Mutates ``df`` in place (callers rely on this) and also returns it.
    ``outliers`` maps column name -> list of values to replace; medians are
    computed from the columns before any replacement.
    """
    medians = {col: df[col].median() for col in outliers}
    for col, bad_values in outliers.items():
        df[col] = df[col].replace(bad_values, medians[col])
    return df
# Replace the detected outliers with column medians.
# Note: the replacement happens in place, so df_imp_wnout aliases the mutated df_imp.
df_imp_wnout = replace_outliers_with_median(df_imp, sample_outliers)
df_imp_wnout.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6.0 | 148.0 | 72.0 | 35.000000 | 218.922943 | 33.6 | 0.6270 | 50.0 | 1 |
| 1 | 1.0 | 85.0 | 66.0 | 29.000000 | 70.302975 | 26.6 | 0.3510 | 31.0 | 0 |
| 2 | 8.0 | 183.0 | 64.0 | 21.532622 | 268.502252 | 23.3 | 0.6720 | 32.0 | 1 |
| 3 | 1.0 | 89.0 | 66.0 | 23.000000 | 94.000000 | 28.1 | 0.1670 | 21.0 | 0 |
| 4 | 0.0 | 137.0 | 40.0 | 35.000000 | 168.000000 | 43.1 | 0.3725 | 33.0 | 1 |
Ya tendriamos nuestro df sin outliers y sin valores Nan
# Re-draw the per-feature box plots (by Outcome) after imputation and outlier
# replacement, on the same 2x4 grid. Titles on the intermediate figures are
# discarded when only the data trace is copied into the composite.
# NOTE(review): the third title looks like a typo for "BloodPressure".
plots = [
    ("Pregnancies", "Pregnancies"),
    ("Glucose", "Glucose"),
    ("BloodPressure", "PregnancBloodPressureies"),
    ("SkinThickness", "SkinThickness"),
    ("Insulin", "Insulin"),
    ("BMI", "BMI"),
    ("DiabetesPedigreeFunction", "DiabetesPedigreeFunction"),
    ("Age", "Age"),
]
fig = make_subplots(rows=2, cols=4)
for idx, (feature, titulo) in enumerate(plots):
    box = px.box(df_imp_wnout, x='Outcome', y=feature, notched=False, title=titulo)
    fig.add_trace(box['data'][0], row=idx // 4 + 1, col=idx % 4 + 1)
fig.update_layout(title='Boxplots de diferentes características por Outcome')
fig.show()
Todavia se logran observar en el boxplot datos atipicos pero no son tan distantes como antes.
Detección con Local Outlier Probability (LoOP) — nota: el código aplica LoOP (PyNomaly), no el test de Rosner.
# Local Outlier Probability (LoOP): assigns each row a probability in [0, 1]
# of being an outlier, based on local density.
lof_model = loop.LocalOutlierProbability(df_imp, use_numba=False)
probabilities = lof_model.fit().local_outlier_probabilities
# Keep only observations that are almost certainly anomalous (p > 0.9).
outliers = df_imp[probabilities > 0.9]
outliers
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 62 | 5.0 | 44.0 | 62.0 | 21.018675 | -19.507362 | 25.0 | 0.587 | 36.0 | 0 |
Aplicando LoOP podemos identificar que la observación número 62 del df presenta datos atípicos, lo que se confirma con el valor imputado de −19 de insulina, un valor que no es posible.
Test Hampel
# Hampel identifier: flag values farther than k * MAD from the column median.
threshold_factor = 3
outliers_df = pd.DataFrame()
for column in df_imp.columns:
    # Fix: skip the binary target. Its median is 0 and its MAD is 0, so the
    # Hampel rule flagged every positive label (268 rows) as an "outlier",
    # which is meaningless for a 0/1 class column.
    if column == "Outcome":
        continue
    mediana = np.median(df_imp[column])
    mad = scale.mad(df_imp[column])  # robust spread estimate
    threshold = threshold_factor * mad
    outliers = df_imp[column][abs(df_imp[column] - mediana) > threshold]
    if not outliers.empty:
        outliers_df = pd.concat([outliers_df, pd.DataFrame({'column': column, 'index': outliers.index, 'value': outliers.values})])
# Number of flagged values per feature.
Cant_Outliers = outliers_df["column"].value_counts()
Cant_Outliers
Outcome 268 DiabetesPedigreeFunction 30 Age 22 Pregnancies 19 Insulin 10 BMI 3 BloodPressure 2 Name: column, dtype: int64
Se observan 30 datos atípicos para DiabetesPedigreeFunction, 22 para Age (edad), 19 para Pregnancies, 10 para Insulin, 3 para BMI y 2 para BloodPressure. Los 268 valores señalados en Outcome son un artefacto de aplicar el filtro de Hampel a una etiqueta binaria (mediana 0 y MAD 0), no outliers reales.